import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import plotly.express as px
raw_movies_df = pd.read_csv('../csv/MoviesOnStreamingPlatforms_updated.csv', index_col=1)
raw_movies_df.drop(columns='Unnamed: 0', inplace=True)
raw_movies_df.head()
| Title | Year | Age | IMDb | Rotten Tomatoes | Netflix | Hulu | Prime Video | Disney+ | Type | Directors | Genres | Country | Language | Runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||
| 1 | Inception | 2010 | 13+ | 8.8 | 87% | 1 | 0 | 0 | 0 | 0 | Christopher Nolan | Action,Adventure,Sci-Fi,Thriller | United States,United Kingdom | English,Japanese,French | 148.0 |
| 2 | The Matrix | 1999 | 18+ | 8.7 | 87% | 1 | 0 | 0 | 0 | 0 | Lana Wachowski,Lilly Wachowski | Action,Sci-Fi | United States | English | 136.0 |
| 3 | Avengers: Infinity War | 2018 | 13+ | 8.5 | 84% | 1 | 0 | 0 | 0 | 0 | Anthony Russo,Joe Russo | Action,Adventure,Sci-Fi | United States | English | 149.0 |
| 4 | Back to the Future | 1985 | 7+ | 8.5 | 96% | 1 | 0 | 0 | 0 | 0 | Robert Zemeckis | Adventure,Comedy,Sci-Fi | United States | English | 116.0 |
| 5 | The Good, the Bad and the Ugly | 1966 | 18+ | 8.8 | 97% | 1 | 0 | 1 | 0 | 0 | Sergio Leone | Western | Italy,Spain,West Germany | Italian | 161.0 |
Make the column names uniform
raw_movies_df.columns
Index(['Title', 'Year', 'Age', 'IMDb', 'Rotten Tomatoes', 'Netflix', 'Hulu',
'Prime Video', 'Disney+', 'Type', 'Directors', 'Genres', 'Country',
'Language', 'Runtime'],
dtype='object')
raw_movies_df.columns = map(str.lower, raw_movies_df.columns)
Making all column names lower case for ease of use. Can also use: df.rename(str.lower, axis='columns')
raw_movies_df = raw_movies_df.rename(columns={'rotten tomatoes':'rotten_tomatoes','prime video': 'prime_video'})
raw_movies_df
| title | year | age | imdb | rotten_tomatoes | netflix | hulu | prime_video | disney+ | type | directors | genres | country | language | runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||
| 1 | Inception | 2010 | 13+ | 8.8 | 87% | 1 | 0 | 0 | 0 | 0 | Christopher Nolan | Action,Adventure,Sci-Fi,Thriller | United States,United Kingdom | English,Japanese,French | 148.0 |
| 2 | The Matrix | 1999 | 18+ | 8.7 | 87% | 1 | 0 | 0 | 0 | 0 | Lana Wachowski,Lilly Wachowski | Action,Sci-Fi | United States | English | 136.0 |
| 3 | Avengers: Infinity War | 2018 | 13+ | 8.5 | 84% | 1 | 0 | 0 | 0 | 0 | Anthony Russo,Joe Russo | Action,Adventure,Sci-Fi | United States | English | 149.0 |
| 4 | Back to the Future | 1985 | 7+ | 8.5 | 96% | 1 | 0 | 0 | 0 | 0 | Robert Zemeckis | Adventure,Comedy,Sci-Fi | United States | English | 116.0 |
| 5 | The Good, the Bad and the Ugly | 1966 | 18+ | 8.8 | 97% | 1 | 0 | 1 | 0 | 0 | Sergio Leone | Western | Italy,Spain,West Germany | Italian | 161.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 16740 | The Ghosts of Buxley Hall | 1980 | NaN | 6.2 | NaN | 0 | 0 | 0 | 1 | 0 | Bruce Bilson | Comedy,Family,Fantasy,Horror | United States | English | 120.0 |
| 16741 | The Poof Point | 2001 | 7+ | 4.7 | NaN | 0 | 0 | 0 | 1 | 0 | Neal Israel | Comedy,Family,Sci-Fi | United States | English | 90.0 |
| 16742 | Sharks of Lost Island | 2013 | NaN | 5.7 | NaN | 0 | 0 | 0 | 1 | 0 | Neil Gelinas | Documentary | United States | English | NaN |
| 16743 | Man Among Cheetahs | 2017 | NaN | 6.6 | NaN | 0 | 0 | 0 | 1 | 0 | Richard Slater-Jones | Documentary | United States | English | NaN |
| 16744 | In Beaver Valley | 1950 | NaN | NaN | NaN | 0 | 0 | 0 | 1 | 0 | James Algar | Documentary,Short,Family | United States | English | 32.0 |
16744 rows × 15 columns
raw_movies_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 16744 entries, 1 to 16744 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 title 16744 non-null object 1 year 16744 non-null int64 2 age 7354 non-null object 3 imdb 16173 non-null float64 4 rotten_tomatoes 5158 non-null object 5 netflix 16744 non-null int64 6 hulu 16744 non-null int64 7 prime_video 16744 non-null int64 8 disney+ 16744 non-null int64 9 type 16744 non-null int64 10 directors 16018 non-null object 11 genres 16469 non-null object 12 country 16309 non-null object 13 language 16145 non-null object 14 runtime 16152 non-null float64 dtypes: float64(2), int64(6), object(7) memory usage: 2.0+ MB
raw_movies_df['rotten_tomatoes'] = raw_movies_df['rotten_tomatoes'].str.replace('%', '')
raw_movies_df.head()
| title | year | age | imdb | rotten_tomatoes | netflix | hulu | prime_video | disney+ | type | directors | genres | country | language | runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ID | |||||||||||||||
| 1 | Inception | 2010 | 13+ | 8.8 | 87 | 1 | 0 | 0 | 0 | 0 | Christopher Nolan | Action,Adventure,Sci-Fi,Thriller | United States,United Kingdom | English,Japanese,French | 148.0 |
| 2 | The Matrix | 1999 | 18+ | 8.7 | 87 | 1 | 0 | 0 | 0 | 0 | Lana Wachowski,Lilly Wachowski | Action,Sci-Fi | United States | English | 136.0 |
| 3 | Avengers: Infinity War | 2018 | 13+ | 8.5 | 84 | 1 | 0 | 0 | 0 | 0 | Anthony Russo,Joe Russo | Action,Adventure,Sci-Fi | United States | English | 149.0 |
| 4 | Back to the Future | 1985 | 7+ | 8.5 | 96 | 1 | 0 | 0 | 0 | 0 | Robert Zemeckis | Adventure,Comedy,Sci-Fi | United States | English | 116.0 |
| 5 | The Good, the Bad and the Ugly | 1966 | 18+ | 8.8 | 97 | 1 | 0 | 1 | 0 | 0 | Sergio Leone | Western | Italy,Spain,West Germany | Italian | 161.0 |
#converting to number, run as float first
raw_movies_df['rotten_tomatoes'] = raw_movies_df['rotten_tomatoes'].astype('float')
raw_movies_df['rotten_tomatoes'] = raw_movies_df['rotten_tomatoes'].astype('Int64')
raw_movies_df['rotten_tomatoes']
ID
1 87
2 87
3 84
4 96
5 97
...
16740 <NA>
16741 <NA>
16742 <NA>
16743 <NA>
16744 <NA>
Name: rotten_tomatoes, Length: 16744, dtype: Int64
movie_df = raw_movies_df.copy()
movie_df.describe()
| year | imdb | rotten_tomatoes | netflix | hulu | prime_video | disney+ | type | runtime | |
|---|---|---|---|---|---|---|---|---|---|
| count | 16744.000000 | 16173.000000 | 5158.000000 | 16744.000000 | 16744.000000 | 16744.000000 | 16744.000000 | 16744.0 | 16152.000000 |
| mean | 2003.014035 | 5.902751 | 65.428461 | 0.212613 | 0.053930 | 0.737817 | 0.033684 | 0.0 | 93.413447 |
| std | 20.674321 | 1.347867 | 26.614496 | 0.409169 | 0.225886 | 0.439835 | 0.180419 | 0.0 | 28.219222 |
| min | 1902.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 1.000000 |
| 25% | 2000.000000 | 5.100000 | 44.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 82.000000 |
| 50% | 2012.000000 | 6.100000 | 71.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.0 | 92.000000 |
| 75% | 2016.000000 | 6.900000 | 88.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.0 | 104.000000 |
| max | 2020.000000 | 9.300000 | 100.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.0 | 1256.000000 |
total = movie_df.isnull().sum().sort_values(ascending=False)
percent = (movie_df.isnull().sum()/movie_df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data
| Total | Percent | |
|---|---|---|
| rotten_tomatoes | 11586 | 0.691949 |
| age | 9390 | 0.560798 |
| directors | 726 | 0.043359 |
| language | 599 | 0.035774 |
| runtime | 592 | 0.035356 |
| imdb | 571 | 0.034102 |
| country | 435 | 0.025979 |
| genres | 275 | 0.016424 |
| type | 0 | 0.000000 |
| disney+ | 0 | 0.000000 |
| prime_video | 0 | 0.000000 |
| hulu | 0 | 0.000000 |
| netflix | 0 | 0.000000 |
| year | 0 | 0.000000 |
| title | 0 | 0.000000 |
netflix_count = len(movie_df[movie_df['netflix'] == 1].index)
hulu_count = len(movie_df[movie_df['hulu'] == 1].index)
prime_count = len(movie_df[movie_df['prime_video'] == 1].index)
disney_count = len(movie_df[movie_df['disney+'] == 1].index)
label = ['Netflix', 'Hulu', 'Prime Video', 'Disney']
count = [netflix_count, hulu_count, prime_count, disney_count]
platform = pd.DataFrame(
{'platform': label,
'movie_count': count,
})
platform
| platform | movie_count | |
|---|---|---|
| 0 | Netflix | 3560 |
| 1 | Hulu | 903 |
| 2 | Prime Video | 12354 |
| 3 | Disney | 564 |
fig = px.pie(platform, names='platform', values='movie_count')
fig.update_traces(rotation=45, pull=[0.1,0.03,0.03,0.03,0.03], textinfo='percent+label', title='Movie count per platform')
fig.show()
yearly_count = movie_df.groupby('year')['title'].count().reset_index().rename(columns={'title':"movie_count"})
fig = px.bar(yearly_count, x='year', y="movie_count", color='movie_count', height=600)
fig.show()
movie_df['genres'].value_counts().head(5)
Drama 1341 Documentary 1229 Comedy 1040 Comedy,Drama 446 Horror 436 Name: genres, dtype: int64
top5_genres = ['Drama','Documentary','Comedy', 'Comedy,Drama','Horror']
table = movie_df.loc[:, ['year', 'genres', 'imdb']]
table['avg_rating'] = table.groupby([table.genres, table.year])['imdb'].transform('mean')
table.drop('imdb', axis=1, inplace=True)
table = table[(table.year > 2009) & (table.year<2020)]
table = table.loc[table['genres'].isin(top5_genres)]
table = table.sort_values('year')
fig = px.bar(table, x='genres', y='avg_rating', animation_frame='year', animation_group='genres',
color='genres', hover_name='genres', range_y=[0,10])
fig.update_layout(showlegend=False)
fig.show()
top_movies_imdb = movie_df[movie_df['imdb']>8.5][['title', 'directors', 'rotten_tomatoes', 'imdb']].sort_values(by=['rotten_tomatoes'], ascending=False)
top_movies_imdb
| title | directors | rotten_tomatoes | imdb | |
|---|---|---|---|---|
| ID | ||||
| 4474 | Stop Making Sense | Jonathan Demme | 100 | 8.6 |
| 4663 | Tom Petty and the Heartbreakers: Runnin' Down ... | Peter Bogdanovich | 100 | 8.6 |
| 3563 | Parasite | Bong Joon Ho | 99 | 8.6 |
| 5 | The Good, the Bad and the Ugly | Sergio Leone | 97 | 8.8 |
| 3562 | GoodFellas | Martin Scorsese | 96 | 8.7 |
| ... | ... | ... | ... | ... |
| 11161 | Draw Hard | Jon Nix | <NA> | 8.7 |
| 11208 | Along Recovery | Justin J. Springer | <NA> | 8.6 |
| 11874 | Ruby's Studio: the Feelings Show | Jason Docter,Matt Docter | <NA> | 8.8 |
| 11911 | Second Chance By Betrayal | Terrence Weasel Smith | <NA> | 8.7 |
| 15176 | Shred: The Story of Asher Bradshaw | Kathy Herndl | <NA> | 8.8 |
97 rows × 4 columns
top_movies_rt = movie_df[movie_df['rotten_tomatoes']>90.0][['title', 'directors', 'rotten_tomatoes', 'imdb']].sort_values(by=['rotten_tomatoes'], ascending=False)
top_movies_rt
| title | directors | rotten_tomatoes | imdb | |
|---|---|---|---|---|
| ID | ||||
| 16481 | Marvel Rising: Secret Warriors | Alfred Gimeno,Eric Radomski | 100 | 5.2 |
| 1332 | Attacking the Devil: Harold Evans and the Last... | David Morris,Jacqui Morris | 100 | 8.1 |
| 5214 | Top Spin | Sara Newens,Mina T. Son | 100 | 6.9 |
| 5228 | Bolivia | Israel Adrián Caetano | 100 | 7.3 |
| 5229 | The Story of Louis Pasteur | William Dieterle | 100 | 7.3 |
| ... | ... | ... | ... | ... |
| 3948 | All Square | John Hyams | 91 | 5.8 |
| 6111 | Werewolf | Freddie Francis | 91 | 6.0 |
| 6082 | Michael H. – Profession: Director | Yves Montmayeur | 91 | 7.0 |
| 5084 | Cutter's Way | Ivan Passer | 91 | 6.9 |
| 959 | Nobody Speak: Trials of the Free Press | Brian Knappenberger | 91 | 6.5 |
1035 rows × 4 columns
movies_by_country = movie_df.groupby('country')['title'].count().reset_index().sort_values('title',ascending = False).head(10).rename(columns = {'title':'movie_count'})
fig = px.pie(movies_by_country,names='country', values='movie_count')
fig.update_traces(rotation=180, pull=[0.1,0.03,0.03,0.03,0.03],textinfo="percent+label", title='Movie Count per Country')
fig.update_layout(showlegend=False)
fig.show()
count_by_language = movie_df.groupby('language')['title'].count().reset_index().sort_values('title', ascending=False).head(10).rename(columns={'title': 'movie_count'})
fig = px.bar(count_by_language, x='language', y='movie_count', color='movie_count', height=600)
fig.show()
top_runtime = movie_df.sort_values('runtime', ascending=False).head(10)
fig = px.bar(top_runtime, x='title', y='runtime', color='runtime', height=600)
fig.show()
plt.figure(figsize=(15,7))
chains=movie_df['directors'].value_counts()[:20]
sns.barplot(x=chains, y=chains.index, palette='Set1')
plt.title('Most movies made by director', size=20, pad=20)
plt.xlabel('counts', size=15);